## plot expression on t-SNE
bluefunc <- colorRampPalette(c("lightgrey", "dodgerblue4"))
plotGeneOnTSNE <- function(gene, tsne, clusters) {
  id <- row.names(ann[ann$gene == gene,])
  df <- data.frame(x = tsne$x, y = tsne$y, exp = log2(dataNorm[id,row.names(tsne)]+1) )
  df <- df[order(df$exp,decreasing = FALSE),]
  
  par(mfrow=c(1,2),mar=c(8,4,8,4))
  plot(df$x, df$y, pch=16, cex=0.75, xlab="", ylab="", main=gene, col=bluefunc(10)[findInterval(df$exp, seq(min(df$exp), max(df$exp), (max(df$exp)-min(df$exp))/10))], axes=F)
  box(bty="l")
  
  xl <- min(df$x)+abs(min(df$x)/5); xr <- max(df$x)-max(df$x)/5; 
  yt <- min(df$y)-abs(min(df$y)/5); yb <- yt-((max(df$y)-min(df$y))/35); 
  
  rect(head(seq(xl,xr,(xr-xl)/10),-1), yb, tail(seq(xl,xr,(xr-xl)/10),-1), yt,
       col=bluefunc(10), border=bluefunc(10), lwd=0.5, xpd=NA)
  rect(xl,yb,xr,yt, xpd=NA)
  segments(seq(xl,xr,((xr-xl)/5)),yb,seq(xl,xr,((xr-xl)/5)),yb-((max(df$y)-min(df$y))/35), xpd=NA)
  text(seq(xl,xr,((xr-xl)/5)), yb-((max(df$y)-min(df$y))/35)-1, labels = round(seq(min(df$exp), max(df$exp), (max(df$exp)-min(df$exp))/5),1), cex=0.85, xpd=NA)
  text(xl+(xr-xl)/2, yb-((max(df$y)-min(df$y))/35)-3, labels = expression('log'[2]*' counts + 1'), xpd=NA)
  
  par(mar=c(8,6,8,2))
  boxplot(df$exp~clusters[row.names(df)], col=as.character(levels(as.factor(clusters[row.names(df)]))), main=gene, ylab=expression('log'[2]*' counts + 1'), axes=FALSE)
  box(bty="l"); axis(2, las=2)
  text(x = 1:length(unique(clusters[row.names(df)])), y = min(df$exp)-0.5, srt = 45, adj=1, labels = levels(as.factor(clusters[row.names(df)])), xpd = NA)
}

We start from the batch corrected, normalised counts produced in the 02_batchCorrection.Rmd script.

## normalised, batch corrected counts
sce.corr <- readRDS(paste0(dir, "data/sce.goodQual.NORM.batchCorrected.Rds"))

## HVGs
hvgs <- read.table(paste0(dir, "results/HVGs_minMean1_FDR0.05.tsv"), stringsAsFactors = FALSE)
hvgs <- hvgs$V1

## UMAP
umap <- read.table(paste0(dir, "results/umapCoords_corrected.tab"))

The next step is to cluster the cells to define the cellular diversity of the dataset. We use the distances between cells based on expression of HVGs on the batch corrected data. Clusters are defined through hierarchical clustering and a dynamic tree cut.

## use the distance between cells to identify clusters
dat <- assay(sce.corr)[hvgs,]
test.dist <- dist(t(dat))

## define clusters by hierarchical clustering and dynamic tree cut
test.clust <- hclust(test.dist, method="average")
cut <- cutreeDynamic(test.clust, distM=as.matrix(test.dist), minClusterSize=40, method="hybrid", deepSplit = 1, verbose = 0)
sce.corr$cluster <- cut

names(cut) <- colnames(dat)
write.table(cut, paste0(dir, "results/clusters_average_min40.tsv"), quote = FALSE, sep="\t", col.names = FALSE)

stopifnot(identical(names(cut), row.names(umap)))
umap$cluster <- cut
o <- order(umap$cluster)
plot(umap$x[o], umap$y[o], pch=16, cex=0.75, col=umap$cluster[o], xlab="UMAP - dim1", ylab="UMAP - dim2", bty="l")

This procedure results in 12 clusters.

table(cut)[-1]
## cut
##   1   2   3   4   5   6   7   8   9  10  11  12 
## 713 514 405 355 260 221 194 184  89  65  59  45

Importantly, clusters are comprised of cells from all different batches.

tmp <- sce.corr[,-which(sce.corr$cluster==0)]
table(batch=tmp$batch, cluster=tmp$cluster)
##          cluster
## batch       1   2   3   4   5   6   7   8   9  10  11  12
##   batch_1  21   1   6   0   4   1   0   1   0   4   1   0
##   batch_2  83  57  34  23  16  10  26  16  17   9   0   1
##   batch_3  63  57  16  45  11  26  42  10  13   1   4   2
##   batch_4 149 133 110  71  43  33  41  19  12  18  20   4
##   batch_5 210  71 104  69  82  61  25   4   9  21   5  18
##   batch_6 130 101  62 106  59  56  23  34  12  10   6  16
##   batch_7  57  94  73  41  45  34  37 100  26   2  23   4
# barplot(t(t(table(batch=tmp$batch, cluster=tmp$cluster))/colSums(table(batch=tmp$batch, cluster=tmp$cluster))*100), col=1:12)

Finally, we check the QC statistics to make sure that there isn’t a cluster that behaves abnormally.

qc <- read.table(paste0(dir, "data/QCstats_allCells.tsv"))
qc <- qc[names(cut[cut!=0]),]

par(mfrow=c(2,2))
boxplot(log10(qc$libSize)~cut[cut!=0], las=2, xlab="cluster", ylab=expression('log'[10]*' library size'), col=1:12)
boxplot(qc$nGenes/1e3~cut[cut!=0], las=2, xlab="cluster", ylab="number of genes expressed x 1000", col=1:12)
boxplot(qc$mit/qc$libSize*100~cut[cut!=0], las=2, xlab="cluster", ylab="% in MT", col=1:12)
boxplot(qc$ercc/qc$libSize*100~cut[cut!=0], las=2, xlab="cluster", ylab="% in ERCC", col=1:12)

Cluster markers

To get an initial idea of the identity of each cluster, we use the findMarkers function to identify genes with large fold-changes in a particular cluster compared to the rest. We require genes to be significant in all comparisons against all other clusters; this returns genes that are most specific to each cluster. But can be problematic if there are some closely related clusters that share markers.

This approach returns significant genes for all but one cluster.

## use normalised counts and block by batch instead of using batch corrected counts
sce <- readRDS(paste0(dir, "data/sce_goodQual.NORM.Rds"))

## add cluster info
stopifnot(identical(colnames(logcounts(sce)), names(cut)))
sce$clusters <- cut

## remove outlier cell
sce <- sce[,sce$clusters>0]

## find markers
keep <- rowMeans(logcounts(sce)) > 0.1
markersDE <- findMarkers(sce, groups = sce$clusters, block=sce$batch, direction="up", subset.row=keep, pval.type="all")
unlist(lapply(markersDE, function(x) sum(x$FDR<0.05)))
##   1   2   3   4   5   6   7   8   9  10  11  12 
## 450   0  82 107 324  11  29  48  62  24 296 803
saveRDS(markersDE, file=paste0(dir, "results/markerGenes_pval_all.Rds"))

To recover markers for cluster 2, we instead require the gene to be significantly different against at least 8 of the 12 clusters. This of course returns higher numbers of significant genes.

markersDE.some <- findMarkers(sce, groups = sce$clusters, block=sce$batch, direction="up", subset.row=keep, pval.type="some", min.prop=0.75)
unlist(lapply(markersDE.some, function(x) sum(x$FDR<0.05)))
##    1    2    3    4    5    6    7    8    9   10   11   12 
## 1348  116  722  427 1013  370  244  287  191  282  715 1322
saveRDS(markersDE, file=paste0(dir, "results/markerGenes_pval_some0.75.Rds"))

Below is the expression of the top 10 genes found for each cluster:

th <- theme_bw() + theme(axis.text.x = element_text(size=12), axis.title.x = element_text(size=12), axis.text.y = element_text(size=12), axis.title.y = element_text(size=12), axis.ticks.x = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), panel.border = element_blank(), plot.title = element_text(face="bold", hjust = 0.5))

plotGeneOnUMAP <- function(umap=umap, data=sce, clusters=clusters, gene=gene){
  df <- data.frame(x=umap$x, y=umap$y, expr=as.numeric(logcounts(sce[gene,])), cluster=clusters)
  df <- df[df$cluster>0,]
  df <- df[order(df$expr),]
  
  p <- list()
  p[[1]] <- ggplot(df, aes(x,y)) + geom_point(aes(colour=expr), alpha = 0.5) + scale_colour_gradientn(colours = colorRampPalette(c("grey", "lightblue", "dodgerblue4", "royalblue4"))(100)) + ggtitle(rowData(sce)[id,1]) + xlab("") + ylab("") + labs(colour=expression('log'[2]*' counts')) + th + theme(legend.position = "none") + guides(colour = guide_colorbar(title.position = "bottom"))
  p[[2]] <- ggplot(df, aes(as.factor(cluster), expr)) + geom_boxplot(fill=1:12) + ggtitle(rowData(sce)[id,1]) + xlab("") + ylab("") + th
  return(p)
}

Cluster 1

cluster <- 1
for(i in 1:10){
  id <- row.names(markersDE[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

Cluster 2

cluster <- 2
for(i in 1:10){
  id <- row.names(markersDE.some[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

Cluster 3

cluster <- 3
for(i in 1:10){
  id <- row.names(markersDE[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

Cluster 4

cluster <- 4
for(i in 1:10){
  id <- row.names(markersDE[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

Cluster 5

cluster <- 5
for(i in 1:10){
  id <- row.names(markersDE[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

Cluster 6

cluster <- 6
for(i in 1:10){
  id <- row.names(markersDE[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

Cluster 7

cluster <- 7
for(i in 1:10){
  id <- row.names(markersDE[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

Cluster 8

cluster <- 8
for(i in 1:10){
  id <- row.names(markersDE[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

Cluster 9

cluster <- 9
for(i in 1:10){
  id <- row.names(markersDE[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

Cluster 10

cluster <- 10
for(i in 1:10){
  id <- row.names(markersDE[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

Cluster 11

cluster <- 11
for(i in 1:10){
  id <- row.names(markersDE[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

Cluster 12

cluster <- 12
for(i in 1:10){
  id <- row.names(markersDE[[cluster]])[i]
  plots <- plotGeneOnUMAP(umap = umap[-which(cut==0),], data = sce, clusters = cut[cut>0], gene = id)
  print(ggarrange(plotlist = plots, ncol=2, nrow=1))
}

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.2.3                magrittr_1.5               
##  [3] ggplot2_3.2.1               RColorBrewer_1.1-2         
##  [5] dynamicTreeCut_1.63-1       scran_1.14.1               
##  [7] SingleCellExperiment_1.8.0  SummarizedExperiment_1.16.0
##  [9] DelayedArray_0.12.0         BiocParallel_1.20.0        
## [11] matrixStats_0.55.0          Biobase_2.46.0             
## [13] GenomicRanges_1.38.0        GenomeInfoDb_1.22.0        
## [15] IRanges_2.20.0              S4Vectors_0.24.0           
## [17] BiocGenerics_0.32.0        
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.2               rsvd_1.0.2              
##  [3] locfit_1.5-9.1           lattice_0.20-38         
##  [5] assertthat_0.2.1         digest_0.6.22           
##  [7] R6_2.4.0                 evaluate_0.14           
##  [9] pillar_1.4.2             zlibbioc_1.32.0         
## [11] rlang_0.4.1              lazyeval_0.2.2          
## [13] rstudioapi_0.10          irlba_2.3.3             
## [15] Matrix_1.2-17            rmarkdown_1.16          
## [17] labeling_0.3             BiocNeighbors_1.4.1     
## [19] statmod_1.4.32           stringr_1.4.0           
## [21] igraph_1.2.4.1           RCurl_1.95-4.12         
## [23] munsell_0.5.0            compiler_3.6.1          
## [25] vipor_0.4.5              BiocSingular_1.2.0      
## [27] xfun_0.10                pkgconfig_2.0.3         
## [29] ggbeeswarm_0.6.0         htmltools_0.4.0         
## [31] tidyselect_0.2.5         gridExtra_2.3           
## [33] tibble_2.1.3             GenomeInfoDbData_1.2.2  
## [35] edgeR_3.28.0             viridisLite_0.3.0       
## [37] withr_2.1.2              crayon_1.3.4            
## [39] dplyr_0.8.3              bitops_1.0-6            
## [41] grid_3.6.1               gtable_0.3.0            
## [43] scales_1.0.0             dqrng_0.2.1             
## [45] stringi_1.4.3            ggsignif_0.6.0          
## [47] XVector_0.26.0           viridis_0.5.1           
## [49] limma_3.42.0             scater_1.14.1           
## [51] DelayedMatrixStats_1.8.0 cowplot_1.0.0           
## [53] tools_3.6.1              glue_1.3.1              
## [55] beeswarm_0.2.3           purrr_0.3.3             
## [57] yaml_2.2.0               colorspace_1.4-1        
## [59] knitr_1.25